{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81cca085-dba2-42eb-a13b-fa64b6e86583",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "24/12/11 15:50:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+----------+--------+--------------------+----------+--------------------+-------------------+\n",
      "|    user_id| device_id|referrer|                host|       url|          event_time|         event_date|\n",
      "+-----------+----------+--------+--------------------+----------+--------------------+-------------------+\n",
      "| 1037710827| 532630305|    NULL| www.zachwilson.tech|         /|2021-03-08 17:27:...|2021-03-08 00:00:00|\n",
      "|  925588856| 532630305|    NULL|    www.eczachly.com|         /|2021-05-10 11:26:...|2021-05-10 00:00:00|\n",
      "|-1180485268| 532630305|    NULL|admin.zachwilson....|         /|2021-02-17 16:19:...|2021-02-17 00:00:00|\n",
      "|-1044833855| 532630305|    NULL| www.zachwilson.tech|         /|2021-09-24 15:53:...|2021-09-24 00:00:00|\n",
      "|  747494706| 532630305|    NULL| www.zachwilson.tech|         /|2021-09-26 16:03:...|2021-09-26 00:00:00|\n",
      "|  747494706| 532630305|    NULL|admin.zachwilson....|         /|2021-02-21 16:08:...|2021-02-21 00:00:00|\n",
      "| -824540328| 532630305|    NULL|admin.zachwilson....|         /|2021-09-28 17:23:...|2021-09-28 00:00:00|\n",
      "| -824540328| 532630305|    NULL|    www.eczachly.com|         /|2021-09-29 01:22:...|2021-09-29 00:00:00|\n",
      "| 1833036683| 532630305|    NULL|admin.zachwilson....|         /|2021-01-24 03:15:...|2021-01-24 00:00:00|\n",
      "|-2134824313| 532630305|    NULL|    www.eczachly.com|         /|2021-01-25 00:03:...|2021-01-25 00:00:00|\n",
      "|-1809929467|-906264142|    NULL|admin.zachwilson....|/.git/HEAD|2021-02-22 01:36:...|2021-02-22 00:00:00|\n",
      "| 2002285749|-906264142|    NULL|    www.eczachly.com|         /|2021-02-22 02:25:...|2021-02-22 00:00:00|\n",
      "|-1562965412| 532630305|    NULL| www.zachwilson.tech|         /|2021-01-30 20:46:...|2021-01-30 00:00:00|\n",
      "|-1099860451| 532630305|    NULL|    www.eczachly.com|         /|2021-02-04 23:49:...|2021-02-04 00:00:00|\n",
      "| 1246896869|-906264142|    NULL| www.zachwilson.tech|         /|2021-02-22 02:50:...|2021-02-22 00:00:00|\n",
      "| -629331502|-906264142|    NULL|admin.zachwilson....|/.git/HEAD|2021-02-22 23:51:...|2021-02-22 00:00:00|\n",
      "|-1913422462|-906264142|    NULL|    www.eczachly.com|         /|2021-02-23 00:17:...|2021-02-23 00:00:00|\n",
      "|   50429624| 532630305|    NULL|    www.eczachly.com|         /|2022-12-28 01:38:...|2022-12-28 00:00:00|\n",
      "|  222389292| 532630305|    NULL| www.zachwilson.tech|         /|2022-12-28 05:23:...|2022-12-28 00:00:00|\n",
      "| -779924777| 532630305|    NULL| www.zachwilson.tech|         /|2022-12-28 16:45:...|2022-12-28 00:00:00|\n",
      "+-----------+----------+--------+--------------------+----------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import expr, col\n",
    "spark = SparkSession.builder.appName(\"Jupyter\").getOrCreate()\n",
    "\n",
    "spark\n",
    "\n",
    "events = spark.read.option(\"header\", \"true\").csv(\"/home/iceberg/data/events.csv\").withColumn(\"event_date\", expr(\"DATE_TRUNC('day', event_time)\"))\n",
    "devices = spark.read.option(\"header\",\"true\").csv(\"/home/iceberg/data/devices.csv\")\n",
    "\n",
    "df = events.join(devices,on=\"device_id\",how=\"left\")\n",
    "df = df.withColumnsRenamed({'browser_type': 'browser_family', 'os_type': 'os_family'})\n",
    "\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "dce068df-3e21-429a-8716-abdd13e9406c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|    user_id|  device_id|            referrer|                host|                 url|          event_time|         event_date|\n",
      "+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "| 1129583063|  532630305|                NULL|admin.zachwilson....|                   /|2021-01-07 09:21:...|2021-01-07 00:00:00|\n",
      "| -648945006| 1088283544|                NULL|    www.eczachly.com|                   /|2021-01-07 02:58:...|2021-01-07 00:00:00|\n",
      "|-1871780024| -158310583|                NULL|    www.eczachly.com|                   /|2021-01-07 04:17:...|2021-01-07 00:00:00|\n",
      "|  203689086| 1088283544|                NULL|    www.eczachly.com|/blog/what-exactl...|2021-01-07 10:03:...|2021-01-07 00:00:00|\n",
      "|-1180485268|  532630305|                NULL|    www.eczachly.com|                   /|2021-01-07 18:45:...|2021-01-07 00:00:00|\n",
      "| 1129583063|  532630305|                NULL|    www.eczachly.com|                   /|2021-01-07 21:57:...|2021-01-07 00:00:00|\n",
      "|-1381834161| -158310583|                NULL|    www.eczachly.com|                   /|2021-01-07 23:07:...|2021-01-07 00:00:00|\n",
      "|-1373330946|  532630305|                NULL| www.zachwilson.tech|  /api/v1/spark-post|2021-01-07 18:53:...|2021-01-07 00:00:00|\n",
      "|-1180485268|  532630305|                NULL| www.zachwilson.tech|                   /|2021-01-07 19:20:...|2021-01-07 00:00:00|\n",
      "|-1617088793| 1957784035|https://www.zachw...| www.zachwilson.tech|            /contact|2021-01-07 00:19:...|2021-01-07 00:00:00|\n",
      "|-1617088793| 1957784035|https://www.zachw...| www.zachwilson.tech|     /api/v1/contact|2021-01-07 00:19:...|2021-01-07 00:00:00|\n",
      "|-1617088793| 1957784035|https://www.zachw...| www.zachwilson.tech|            /contact|2021-01-07 00:19:...|2021-01-07 00:00:00|\n",
      "| -267976675| 1141939293|https://www.zachw...| www.zachwilson.tech|            /contact|2021-01-07 01:57:...|2021-01-07 00:00:00|\n",
      "| -267976675| 1141939293|https://www.zachw...| www.zachwilson.tech|     /api/v1/contact|2021-01-07 01:57:...|2021-01-07 00:00:00|\n",
      "| -267976675| 1141939293|https://www.zachw...| www.zachwilson.tech|            /contact|2021-01-07 01:57:...|2021-01-07 00:00:00|\n",
      "|  604069040| 1800580819|                NULL| www.zachwilson.tech|                   /|2021-01-07 02:24:...|2021-01-07 00:00:00|\n",
      "|-2135898320| 2066336516|http://www.laurel...| www.zachwilson.tech|                   /|2021-01-07 04:57:...|2021-01-07 00:00:00|\n",
      "|  -43429336|-1217993711|                NULL| www.zachwilson.tech|            /contact|2021-01-07 04:57:...|2021-01-07 00:00:00|\n",
      "| 1198758947|-1307168836|                NULL| www.zachwilson.tech|               /blog|2021-01-07 09:57:...|2021-01-07 00:00:00|\n",
      "|-1894709267| 1141939293|https://www.zachw...| www.zachwilson.tech|                   /|2021-01-07 10:12:...|2021-01-07 00:00:00|\n",
      "+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n",
      "+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|    user_id|  device_id|            referrer|                host|                 url|          event_time|         event_date|\n",
      "+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "| 1272828233| -643696601|                NULL|admin.zachwilson....|                   /|2021-01-02 13:53:...|2021-01-02 00:00:00|\n",
      "|  747494706|  532630305|                NULL|admin.zachwilson....|                   /|2021-01-02 19:36:...|2021-01-02 00:00:00|\n",
      "| 2110046626|  898871897|                NULL|admin.zachwilson....|       /wp-login.php|2021-01-02 19:57:...|2021-01-02 00:00:00|\n",
      "| 1272828233| -643696601|                NULL|admin.zachwilson....|                   /|2021-01-02 21:05:...|2021-01-02 00:00:00|\n",
      "| 1272828233| -643696601|                NULL|admin.zachwilson....|                   /|2021-01-02 21:37:...|2021-01-02 00:00:00|\n",
      "| 1399665425|-2012543895|                NULL|    www.eczachly.com|                   /|2021-01-02 00:20:...|2021-01-02 00:00:00|\n",
      "|  125243313| -290659081|                NULL|    www.eczachly.com|                   /|2021-01-02 02:06:...|2021-01-02 00:00:00|\n",
      "|  632739597| -290659081|                NULL|    www.eczachly.com|/blog/what-exactl...|2021-01-02 02:58:...|2021-01-02 00:00:00|\n",
      "|-1780827820| -290659081|                NULL|    www.eczachly.com|                   /|2021-01-02 04:45:...|2021-01-02 00:00:00|\n",
      "|  632739597| -290659081|                NULL|    www.eczachly.com|                   /|2021-01-02 05:14:...|2021-01-02 00:00:00|\n",
      "| 1047962242| -158310583|                NULL|    www.eczachly.com|                   /|2021-01-02 11:40:...|2021-01-02 00:00:00|\n",
      "|  273700037| -290659081|                NULL|    www.eczachly.com|                   /|2021-01-02 07:51:...|2021-01-02 00:00:00|\n",
      "| 1272828233| -643696601|                NULL|    www.eczachly.com|                   /|2021-01-02 08:14:...|2021-01-02 00:00:00|\n",
      "|  210988258| 1088283544|                NULL|    www.eczachly.com|            /contact|2021-01-02 11:11:...|2021-01-02 00:00:00|\n",
      "|  273700037| -290659081|                NULL|    www.eczachly.com|                   /|2021-01-02 11:23:...|2021-01-02 00:00:00|\n",
      "|  632739597| -290659081|                NULL|    www.eczachly.com|        /sitemap.xml|2021-01-02 14:10:...|2021-01-02 00:00:00|\n",
      "|  659201289| -290659081|                NULL|    www.eczachly.com|/blog/life-of-a-s...|2021-01-02 15:53:...|2021-01-02 00:00:00|\n",
      "| 1072106763| -784483831|https://www.zachw...| www.zachwilson.tech|            /contact|2021-01-02 09:37:...|2021-01-02 00:00:00|\n",
      "| 1744817842| -223216734|                NULL| www.zachwilson.tech|/graph/stock-tick...|2021-01-02 00:23:...|2021-01-02 00:00:00|\n",
      "| 1445055201| 1957784035|https://www.zachw...| www.zachwilson.tech|            /contact|2021-01-02 10:17:...|2021-01-02 00:00:00|\n",
      "+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sorted = df.repartition(10, col(\"event_date\"))\\\n",
    "    .sortWithinPartitions(col(\"event_date\"), col(\"host\"))\\\n",
    "    .withColumn(\"event_time\", col(\"event_time\").cast(\"timestamp\")) \n",
    "\n",
    "sortedTwo = df.repartition(10, col(\"event_date\"))\\\n",
    "    .sort(col(\"event_date\"), col(\"host\"))\\\n",
    "    .withColumn(\"event_time\", col(\"event_time\").cast(\"timestamp\")) \n",
    "\n",
    "sorted.show()\n",
    "sortedTwo.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e75099ce-0e80-4903-8d3e-e4842a9a99f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# .sortWithinPartitions() sorts within partitions, whereas .sort() is a global sort, which is very slow\n",
    "\n",
    "# Note - exchange is synonymous with Shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5cc0bb63-2ef0-4a53-af25-881fbd785d55",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "== Physical Plan ==\n",
      "AdaptiveSparkPlan isFinalPlan=false\n",
      "+- Project [user_id#17, device_id#18, referrer#19, host#20, url#21, cast(event_time#22 as timestamp) AS event_time#288, event_date#29]\n",
      "   +- Sort [event_date#29 ASC NULLS FIRST, host#20 ASC NULLS FIRST], false, 0\n",
      "      +- Exchange hashpartitioning(event_date#29, 10), REPARTITION_BY_NUM, [plan_id=294]\n",
      "         +- Project [user_id#17, device_id#18, referrer#19, host#20, url#21, event_time#22, date_trunc(day, cast(event_time#22 as timestamp), Some(Etc/UTC)) AS event_date#29]\n",
      "            +- FileScan csv [user_id#17,device_id#18,referrer#19,host#20,url#21,event_time#22] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<user_id:string,device_id:string,referrer:string,host:string,url:string,event_time:string>\n",
      "\n",
      "\n",
      "== Physical Plan ==\n",
      "AdaptiveSparkPlan isFinalPlan=false\n",
      "+- Project [user_id#17, device_id#18, referrer#19, host#20, url#21, cast(event_time#22 as timestamp) AS event_time#296, event_date#29]\n",
      "   +- Sort [event_date#29 ASC NULLS FIRST, host#20 ASC NULLS FIRST], true, 0\n",
      "      +- Exchange rangepartitioning(event_date#29 ASC NULLS FIRST, host#20 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=316]\n",
      "         +- Exchange hashpartitioning(event_date#29, 10), REPARTITION_BY_NUM, [plan_id=312]\n",
      "            +- Project [user_id#17, device_id#18, referrer#19, host#20, url#21, event_time#22, date_trunc(day, cast(event_time#22 as timestamp), Some(Etc/UTC)) AS event_date#29]\n",
      "               +- FileScan csv [user_id#17,device_id#18,referrer#19,host#20,url#21,event_time#22] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<user_id:string,device_id:string,referrer:string,host:string,url:string,event_time:string>\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sorted = df.repartition(10, col(\"event_date\"))\\\n",
    "    .sortWithinPartitions(col(\"event_date\"), col(\"host\"))\\\n",
    "    .withColumn(\"event_time\", col(\"event_time\").cast(\"timestamp\")) \n",
    "\n",
    "sortedTwo = df.repartition(10, col(\"event_date\"))\\\n",
    "    .sort(col(\"event_date\"), col(\"host\"))\\\n",
    "    .withColumn(\"event_time\", col(\"event_time\").cast(\"timestamp\")) \n",
    "\n",
    "sorted.explain()\n",
    "sortedTwo.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d800dca7-2737-4192-b5c0-c1806c105e15",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <thead>\n",
       "        <tr>\n",
       "        </tr>\n",
       "    </thead>\n",
       "    <tbody>\n",
       "    </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "CREATE DATABASE IF NOT EXISTS bootcamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e83cd813-d5c0-4d67-8285-849b882b8bfa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <thead>\n",
       "        <tr>\n",
       "        </tr>\n",
       "    </thead>\n",
       "    <tbody>\n",
       "    </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "DROP TABLE IF EXISTS bootcamp.events"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "b403139c-f342-426a-a6de-81cd86d1cd73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <thead>\n",
       "        <tr>\n",
       "        </tr>\n",
       "    </thead>\n",
       "    <tbody>\n",
       "    </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "DROP TABLE IF EXISTS bootcamp.events_sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d1b197a9-1b63-4130-acbe-01418eede0e5",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <thead>\n",
       "        <tr>\n",
       "        </tr>\n",
       "    </thead>\n",
       "    <tbody>\n",
       "    </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "CREATE TABLE IF NOT EXISTS bootcamp.events (\n",
    "    url STRING,\n",
    "    referrer STRING,\n",
    "    browser_family STRING,\n",
    "    os_family STRING,\n",
    "    device_family STRING,\n",
    "    host STRING,\n",
    "    event_time TIMESTAMP,\n",
    "    event_date DATE\n",
    ")\n",
    "USING iceberg\n",
    "PARTITIONED BY (years(event_date));\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c40b143f-295e-4875-bd7f-12409312b800",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <thead>\n",
       "        <tr>\n",
       "        </tr>\n",
       "    </thead>\n",
       "    <tbody>\n",
       "    </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "\n",
    "CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (\n",
    "    url STRING,\n",
    "    referrer STRING,\n",
    "    browser_family STRING,\n",
    "    os_family STRING,\n",
    "    device_family STRING,\n",
    "    host STRING,\n",
    "    event_time TIMESTAMP,\n",
    "    event_date DATE\n",
    ")\n",
    "USING iceberg\n",
    "PARTITIONED BY (years(event_date));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "00c86e79-a911-464c-ad58-acc92859dcc6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <thead>\n",
       "        <tr>\n",
       "        </tr>\n",
       "    </thead>\n",
       "    <tbody>\n",
       "    </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "++\n",
       "||\n",
       "++\n",
       "++"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "\n",
    "CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (\n",
    "    url STRING,\n",
    "    referrer STRING,\n",
    "    browser_family STRING,\n",
    "    os_family STRING,\n",
    "    device_family STRING,\n",
    "    host STRING,\n",
    "    event_time TIMESTAMP,\n",
    "    event_date DATE\n",
    ")\n",
    "USING iceberg\n",
    "PARTITIONED BY (year(event_date));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2c1254bc-9ecf-4c86-bfd9-de81ecfbb78b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "\n",
    "start_df = df.repartition(4, col(\"event_date\")).withColumn(\"event_time\", col(\"event_time\").cast(\"timestamp\")) \\\n",
    "    \n",
    "first_sort_df = start_df.sortWithinPartitions(col(\"event_date\"), col(\"host\"))\n",
    "\n",
    "start_df.write.mode(\"overwrite\").saveAsTable(\"bootcamp.events_unsorted\")\n",
    "first_sort_df.write.mode(\"overwrite\").saveAsTable(\"bootcamp.events_sorted\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "d207a11b-b0a4-4662-bbe9-747d8f67be7d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <th>size</th>\n",
       "        <th>num_files</th>\n",
       "        <th>sorted</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "        <td>2896920</td>\n",
       "        <td>4</td>\n",
       "        <td>sorted</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "        <td>3211534</td>\n",
       "        <td>4</td>\n",
       "        <td>unsorted</td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "+---------+-----------+----------+\n",
       "|    size | num_files |   sorted |\n",
       "+---------+-----------+----------+\n",
       "| 2896920 |         4 |   sorted |\n",
       "| 3211534 |         4 | unsorted |\n",
       "+---------+-----------+----------+"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "\n",
    "SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'sorted' \n",
    "FROM demo.bootcamp.events_sorted.files\n",
    "\n",
    "UNION ALL\n",
    "SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'unsorted' \n",
    "FROM demo.bootcamp.events_unsorted.files\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "a93db4d6-ac15-4d0e-83da-77b93ad618da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <th>size</th>\n",
       "        <th>num_files</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "        <td>3145713</td>\n",
       "        <td>5</td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "+---------+-----------+\n",
       "|    size | num_files |\n",
       "+---------+-----------+\n",
       "| 3145713 |         5 |\n",
       "+---------+-----------+"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%sql\n",
    "SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files FROM demo.bootcamp.events.files;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "%%sql \n",
    "SELECT COUNT(1) FROM bootcamp.matches_bucketed.files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "674f173d-86ca-4637-9332-37ce04a81451",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "    <tr>\n",
       "        <th>count(1)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "        <td>3665</td>\n",
       "    </tr>\n",
       "</table>"
      ],
      "text/plain": [
       "+----------+\n",
       "| count(1) |\n",
       "+----------+\n",
       "|     3665 |\n",
       "+----------+"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "591ba0b7-2e6f-420a-960d-95e5734b0166",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
